InĀ [1]:
import numpy as np
from scipy.stats import kendalltau
from sklearn.neighbors import KDTree as KDTree_whole
from scipy.spatial import KDTree as KDTree_layer
from sklearn.decomposition import NMF
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import random
from scipy import stats


from scipy.stats import hypergeom
InĀ [2]:
from pyannotate_runtime import collect_types

def rundifferential_test_AUC(df, metaClusterLabel, control, treatment, regulon_name, threshold_file, alpha = .05): """ Add a column to mark successful cells (greater than the mean enrichment score) M is the total number of objects, (all the cells = total population (low and high regulon activity) n is total number of Type I objects. (all cells that pass threshold) The random variate represents the number of Type I objects in N drawn without replacement from the total population.

k is the active amount of cells we are interested in
N the anumberof cells you are planning to sample from the tissue

sf(k, M, n, N, loc=0)
"""
#print(df.columns)
if threshold_file is None:
    calculate_mean = df[regulon_name].mean()
    df['is_successful_{}'.format(regulon_name)] = df[regulon_name] > df[regulon_name].mean()
else:
    print("found file!")
    threshold = returnThreshold_dictionary(threshold_file, 'threshold')
    print(threshold)
    df['is_successful_{}'.format(regulon_name)] = df[regulon_name] > threshold[regulon_name]


# Count successful control cells
control_successful_cells = df[(df[metaClusterLabel] == control) & (df['is_successful_{}'.format(regulon_name)])].shape[0]

# Count successful treatment cells
treatment_successful_cells = df[(df[metaClusterLabel] == treatment) & (df['is_successful_{}'.format(regulon_name)])].shape[0]

total_successful_cells = df[df['is_successful_{}'.format(regulon_name)]].shape[0]


total_population = df.shape[0]
n_1 = df[df[metaClusterLabel] == control].shape[0]
n_2 = df[df[metaClusterLabel] == treatment].shape[0]

print(control_successful_cells  - 1, total_population, total_successful_cells, n_1)

p_value_1 = hypergeom.sf(control_successful_cells  - 1, total_population, total_successful_cells, n_1)
p_value_2 = hypergeom.sf(treatment_successful_cells -1, total_population, total_successful_cells, n_2)

return(regulon_name, round(p_value_1, 2), round(p_value_2, 2))
InĀ [20]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import math 

def calculate_r2_and_coefficients(condx, condy):
    """Perform linear regression and return R² and coefficients."""
    regr = LinearRegression()
    regr.fit(condx, condy)
    y_pred = regr.predict(condx)
    rss = np.sum((condy - y_pred) ** 2)
    tss = np.sum((condy - np.mean(condy)) ** 2)
    r2 = r2_score(condy, y_pred)
    return r2, rss, tss, regr.coef_

def check_r2_consistency(r2, rss, tss):
    """Check if the calculated R² is consistent with the TSS and RSS."""
    return r2 == 1 - (rss / tss)

def calculate_metrics(r2, rss, tss, coef):
    """Calculate the final metrics."""
    r = math.sqrt(r2) * np.sign(coef)
    return round(r.item(),2), round(r2, 2), round(tss, 2), round(rss, 2)

def process_data_for_labels(labels, NMF_embedd):
    """Process the data for each label."""
    func_sep = {}
    for i in labels:
        condx = NMF_embedd[i][0].reshape(-1, 1)
        condy = NMF_embedd[i][1].reshape(-1, 1)
        
        r2, rss, tss, coef = calculate_r2_and_coefficients(condx, condy)
        
        if check_r2_consistency(r2, rss, tss):
            print(i, " works")
        
        r_metric, r2_metric, tss_metric, rss_metric = calculate_metrics(r2, rss, tss, coef)
        func_sep[i] = (r_metric, r2_metric, tss_metric, rss_metric)
    
    return func_sep
InĀ [21]:
class ComparisonTree:
    """ cond1 (control), data (RSS matrix), colnames are everything but the rownames, rownames (these are the regulons) """
    def __init__(self, cond1, ras_data, metaData_RAS_column, 
                 rss_data, colnames, rownames, threshold_file):
        self.cond1 = cond1
        self.data = rss_data
        self.RAS_AUC = ras_data
        self.labels = colnames
        self.sep = []
        self.map_me = {}
        self.NMF_embedd = {}
        self.h_sep = {}
        self.x = self.data[self.cond1].tolist()
        self.label = list(map( lambda x: x.split(" ")[0],self.data[rownames].tolist())) ## rownames of your regulons
        self.metaDataCol = metaData_RAS_column
        self.threshold_file = threshold_file
        self.order = []
        self.condition_label = []
        self.tau_p_value = {}

    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query_ball_tree.html
    def compareLayers(self, layer1, layer2, distance, font=6.5):
        plt.figure(figsize=(6, 6))
        cond1x, cond1y = self.NMF_embedd[layer1][0], self.NMF_embedd[layer1][1]
        cond2x, cond2y = self.NMF_embedd[layer2][0], self.NMF_embedd[layer2][1] # you can compare multiple experiments here
        
        plt.plot(cond1x, cond1y, "xk", markersize=10, label=layer1)
        plt.plot(cond2x, cond2y, "og", markersize=10, label=layer2)
        
        compare_layers = self.NMF_embedd[layer1][4].query_ball_tree(self.NMF_embedd[layer2][4], r=distance)

        for i in range(len(compare_layers)):
            for j in compare_layers[i]:
                plt.plot([cond1x[i], cond2x[i]], [cond1y[i], cond2y[i]], "-b")
        
        for i in range(len(self.NMF_embedd[layer1][0])):  
            plt.text(cond1x[i], cond1y[i], 
                    self.label[i], fontsize=font, ha='right', color='red')
        
        for i in range(len(self.NMF_embedd[layer2][0])):  
            plt.text(cond2x[i], cond2y[i], self.label[i], fontsize=font, ha='left', color='green')
        
        plt.title("distance < {} between {} and {}".format(distance, layer1, layer2))
        plt.xlabel("NMF_base ({})".format(layer1))
        plt.ylabel("NMF_exp ({})".format(layer2))
        plt.legend()
        plt.savefig("Layer {} and {}".format(layer1, layer2))
        
        # Show the plot
        plt.show()

    def calc_global_dictonary(self):
        dict_label = list(np.array([self.label] * len(self.condition_label)).flatten())
        dict_label = list(zip([self.cond1] * len(self.condition_label), self.condition_label, dict_label))
        Idx_embedd = {}
        i = 0
        for ctrl_ref, cond, regulon in dict_label:
            Idx_embedd[i] = ctrl_ref + "-" + cond + ":\t" + regulon
            i += 1
        return Idx_embedd # this is the global dictionary where we can query the entire 3D structure

    def create_global_tree(self, leaf_size = 10): ## can be expensive
        """ what if we want to compare the strcuture of one SCENIC run and do it with another? checking reproducibility"""
        
        X, Y, Z = self.construct_3D_embedding()
        Idx_embedd = self.calc_global_dictonary()
        NMF_XYZ = np.column_stack(((X, Y, Z)))
        entire_tree = KDTree_whole(NMF_XYZ, leaf_size) # query the entire tree structure! #scikit will leaarn the structure 
        return entire_tree, Idx_embedd
   
        
    def compute_tau_and_kdtree(self):
        """Compute Kendall's Tau and KDTree for each label."""
        for i in self.labels:
            y = self.data[i].tolist()
            tau, p_value = kendalltau(self.x, y)
            self.tau_p_value[i] = (tau, p_value)
            query_kdtree = KDTree_layer(np.column_stack((self.x, y)))
            cond1x, condy, test1, h = self.nmf_transform(np.array(list(zip(self.x, y))))
            self.h_sep[i] = h
            self.NMF_embedd[i] = (cond1x, condy, test1, tau, query_kdtree)
            self.sep.append(tau)

    def return_the_tau_pval(self):
        return self.tau_p_value
            
    def analyze_factors(self, cond2, percentages = False):
        H = self.h_sep[cond2]
        def plotHeatmap(H):
            
            sns.heatmap(H, annot=True, cmap="YlGnBu", 
                        xticklabels=[self.cond1 + " (control)", cond2], 
                        yticklabels=[f'NMF_{i+1}' for i in range(H.shape[0])])
            
            plt.title('Percentages (H) with Vectors {} and {}'.format(self.cond1, cond2) if percentages 
                     else 'Coefficient Matrix (H) with Vectors {} and {}'.format(self.cond1, cond2))
            plt.tight_layout()
            plt.savefig('{}_{}.png'.format(self.cond1, cond2))  
            plt.show()
        plt.figure(figsize=(8, 6))  
        if percentages:
            components = []
            for i in H:
                components.append([item/sum(i) for item in i])
            plotHeatmap(np.array(components))
        else:
            plotHeatmap(H)
            
    def nmf_transform(self, rss_data):
        """NMF transfomration"""
        n_components = 2  # Reduce to 2 components for visualization
        nmf = NMF(n_components=n_components, init='random', random_state=42)
        embedding = nmf.fit_transform(rss_data)
        H_matrix = nmf.components_
        return(embedding[:, 0], embedding[:, 1], embedding, H_matrix)

    def map_labels_to_tau(self):
        """Map tau values to labels, handling duplicates."""
        for i, lbl in list(zip(self.sep, self.labels)):
            if i in self.map_me:  # Just in case we have the same tau value
                prev_value = self.map_me[i]
                print(lbl, "warning: has a dup")
                self.map_me[i] = (prev_value, lbl)
            else:
                self.map_me[i] = lbl  # Make the tau into a key

    def sort_and_print_labels(self):
        """Sort tau values and print corresponding labels."""
        self.sep = sorted(self.sep)
        for i in self.sep:
            print(self.map_me[i], i)

    def init_order(self): ## how to construct the 3d embedding space
        order = [] 
        condition_label = []

        ## make sure self.spe is sorted
        for i in sorted(list(set(self.sep))):
            if isinstance(self.map_me[i], str):
                #print("path A")
                order.append(self.map_me[i])
                #print(order)
            else:
                print("path B")
                for i in self.map_me[i]:
                    order.append(i)

        item = order[len(order) - 1]
        order = order[:-1]
        #print("getting rid of {}".format(item))
        # remove one from the top, this is after order is stacked here
        for i in order:
            print(i)
            condition_label.append([i] * len(self.label))

        self.order = order # subtract the one here!
        self.condition_label = np.array(condition_label).flatten()
        
        
    def construct_3D_embedding(self, rawRSS = False):
        X, Y, Z = [], [], []
        def prepare_data(data_source):
            """ Helper function to prepare the data for plotting """
            nonlocal X, Y, Z
            for i in self.order:
                x = data_source[self.cond1].tolist() if rawRSS else self.NMF_embedd[i][0]
                y = data_source[i].tolist() if rawRSS else self.NMF_embedd[i][1]
                Z_val = [self.NMF_embedd[i][3]] * len(self.label)
                X.extend(x)
                Y.extend(y)
                Z.extend(Z_val)
        
        if rawRSS:
            prepare_data(self.data)
        else:
            prepare_data(self.NMF_embedd)

        # Convert lists to np.array and flatten them
        X = np.array(X).flatten()
        Y = np.array(Y).flatten()
        Z = np.array(Z).flatten()

        return(X, Y, Z)

    def initDict(self, label):
        dict = {}
        for i in range(len(label)):
            dict[i] = label[i]
        return dict
        
    def plot_3dEmbedding(self, rawRSS = False, regulonsToView = [], clustersToLabel = []):
        condition_color_map = {}
        for i in self.order: # create the color mappings!
            color = f'#{random.randint(0, 0xFFFFFF):06x}'
            condition_color_map[i] = color
            
        X, Y, Z = self.construct_3D_embedding(rawRSS)
        
        # Create the figure and 3D axis
        fig = plt.figure(figsize=(15, 17))
        ax = fig.add_subplot(111, projection='3d')

        print(self.labels)
        dict = self.initDict(self.label)

        if len(clustersToLabel) == 0:
            clustersToLabel = list(set(self.condition_label))
        
        print(self.label)

        print(condition_color_map)
        
        if len(regulonsToView) > 0:
            for i in range(len(X)):
                item = dict[i % len(self.label)]
                if item in regulonsToView and self.condition_label[i] in clustersToLabel:
                    ax.text(X[i], Y[i], Z[i], s=str(item), 
                            color=condition_color_map[self.condition_label[i]], 
                            fontsize=12, fontweight='bold', style='italic')
                

        # Plot the data points for each condition
        for condition in np.unique(self.condition_label):
            mask = self.condition_label == condition
            ax.scatter(X[mask], Y[mask], Z[mask], color=condition_color_map[condition], label=condition)
    
    
        # Set the labels and title
        ax.set_xlabel(f'RSS (control) {self.cond1}' if rawRSS else 'NMF_1')
        ax.set_ylabel('RSS treatment' if rawRSS else 'NMF_2')
        ax.set_zlabel("Kendall's Tau")
        ax.set_title(f'3D Plot of {self.cond1} vs Conditions with Kendall\'s Tau using NMF reduction' 
                     if not rawRSS else f"3D Plot of RSS scores {self.cond1} vs Conditions with Kendall\'s Tau")
    
        # Add legend
        ax.legend()
    
        # Show the plot
        plt.show()
    
    def construct_tree(self):
        """Construct the entire tree of processing and print the result."""
        self.compute_tau_and_kdtree()
        self.map_labels_to_tau()
        self.sort_and_print_labels()# this is how the 3D embedding will be assembled
        self.init_order() # we are assemling the tree!

    def plotRSS_NMF(self, cond2, drawQuadrants = True, include_pvals = False, alpha = 0.05, tryLabel = ""):
        """Visualize the 2D embeddings between control and treatment group"""
        cond1x = self.NMF_embedd[cond2][0]
        condy = self.NMF_embedd[cond2][1]
        test1 = self.NMF_embedd[cond2][2]
        p_vals = {}
        
        plt.scatter(cond1x, condy)

        def returnThreshold_dictionary(threshold_file, regulonC = None):
            thresholds, regulonNames, regulonThresholds = None, [], []
            
            if "3.5_" in threshold_file:
                thresholds = pd.read_csv(threshold_file, sep="\t").T
                regulonNames = list(map(lambda x: x.split(" ")[0], thresholds.loc["regulon"].tolist()))
                regulonThresholds = thresholds.loc[regulonC] # by the row.
            else:
                thresholds = pd.read_csv(threshold_file)
                regulonNames = list(map(lambda x: x.split(" ")[0], thresholds[regulonC]))
                regulonThreshold = thresholds[regulonC] # column must be named like this ...
                
            threshold_dict = {}
            
            for i in range(len(regulonNames)):
                threshold_dict[regulonNames[i]] = regulonThresholds[i]
                i += 1
            
            return threshold_dict

        def rundifferential_test_AUC(df, metaClusterLabel, control, treatment, regulon_name, threshold_file, alpha = .05):
            """
            Add a column to mark successful cells (greater than the mean enrichment score)
            M is the total number of objects,  (all the cells = total population (low and high regulon activity)
            n is total number of Type I objects.  (all cells that pass threshold)
            The random variate represents the number of Type I objects
            in N drawn without replacement from the total population. 
        
            k is the active amount of cells we are interested in
            N the anumberof cells you are planning to sample from the tissue
        
            sf(k, M, n, N, loc=0)
            """
            df.columns = list(map(lambda x: x.split(" ")[0], df.columns)) 
            if threshold_file is None:
                calculate_mean = df[regulon_name].mean()
                df['is_successful_{}'.format(regulon_name)] = df[regulon_name] > df[regulon_name].mean()
            else:
                threshold = returnThreshold_dictionary(threshold_file, 'threshold')
                #print(threshold[regulon_name], regulon_name)
                df['is_successful_{}'.format(regulon_name)] = df[regulon_name] > threshold[regulon_name]
            
            # Count successful control cells
            control_successful_cells = df[(df[metaClusterLabel] == control) & (df['is_successful_{}'.format(regulon_name)])].shape[0]
            
            # Count successful treatment cells
            treatment_successful_cells = df[(df[metaClusterLabel] == treatment) & (df['is_successful_{}'.format(regulon_name)])].shape[0]    
            total_successful_cells = df[df['is_successful_{}'.format(regulon_name)]].shape[0]
        
            
            total_population = df.shape[0]
            n_1 = df[df[metaClusterLabel] == control].shape[0]
            n_2 = df[df[metaClusterLabel] == treatment].shape[0]
        
            p_value_1 = hypergeom.sf(control_successful_cells  - 1, total_population, total_successful_cells, n_1)
            p_value_2 = hypergeom.sf(treatment_successful_cells -1, total_population, total_successful_cells, n_2)
        
            return(regulon_name, round(p_value_1, 2), round(p_value_2, 2))

        def drawText(cond1x, condy):
            i = 0
            for x_2, y_2 in list(zip(cond1x, condy)):
                text = self.label[i]
                if include_pvals:
                    #rundifferential_test_AUC(df, control, treatment, regulon_name, alpha = .05)
                    min_font_size = 6
                    max_font_size = 12

                    chk = self.threshold_file
                    _, _, p_value_treatment= rundifferential_test_AUC(self.RAS_AUC, 
                                                                       self.metaDataCol, 
                                                                       self.cond1, 
                                                                       cond2, 
                                                                       text, 
                                                                       chk)
                    
                    color = 'red' if p_value_treatment < alpha else 'black'
                    
                    if color == 'black':
                        font_size = min_font_size
                    else:
                        font_size =  max(min_font_size, min(max_font_size, max_font_size / (p_value_treatment * 5)))
                    
                    plt.text(x_2, y_2, text, ha='center', va='bottom', fontsize= font_size, fontweight='bold', color=color)  # the bigger the p_value, smaller text
                    p_vals[text] = p_value_treatment
                else:
                    plt.text(x_2, y_2, text, ha='center', va='bottom', fontsize=6.5, fontweight='bold')
                i += 1


        drawText(cond1x, condy)
        #plt.title("NMF Embedding of RSS Values {} and {}".format(self.cond1, cond2))
        plt.xlabel("NMF_base ({})".format(self.cond1))
        plt.ylabel("NMF_exp ({})".format(cond2))

        if drawQuadrants:
            plt.axvline(x = np.median(cond1x), color="black", linestyle="--")
            plt.axhline(y = np.median(condy), color="black", linestyle="--")
        
        plt.savefig('{}_{}.png'.format(self.cond1,cond2))     
        plt.show()
        
        kmeans = KMeans(n_clusters=2, random_state=42)
        kmeans.fit(test1)
        plt.scatter(cond1x, condy, c=kmeans.labels_, cmap='viridis')
        t2 = drawText(cond1x, condy)

        if drawQuadrants:
            plt.axvline(x = np.median(cond1x), color="black", linestyle="--")
            plt.axhline(y = np.median(condy), color="black", linestyle="--")
            
        plt.title("K-Means Clustering after NMF {} and {}".format(self.cond1, cond2), fontsize=8.5)
        plt.xlabel("NMF_base {}".format(self.cond1))
        plt.ylabel("NMF_exp {}".format(cond2))
        #plt.savefig('{}_{}.png'.format(self.cond1,cond2))  # Save as PNG
        #plt.colorbar(label="Cluster Label")
        plt.show()

        return p_vals
InĀ [22]:
import warnings
warnings.filterwarnings("ignore")
InĀ [23]:
comparison2.order
Out[23]:
['Acute-Infected_blood_CD4+ CTL',
 'Uninfected_blood_CD4+ CTL',
 'Acute-Infected_brain_CD4+ CTL']
InĀ [24]:
comparison2.plot_3dEmbedding()
['Uninfected_blood_CD4+ CTL', 'Acute-Infected_blood_CD4+ CTL', 'Uninfected_brain_CD4+ CTL', 'Acute-Infected_brain_CD4+ CTL']
['RUNX2', 'ETS1', 'TFDP2', 'NR3C1', 'ELF1', 'BDP1', 'NRF1', 'TAF1', 'EP300', 'JUN', 'CREM', 'REL', 'FOS', 'ATF3', 'FOSL2', 'E2F3', 'TBX21', 'EOMES', 'IRF8', 'KLF10', 'BRCA1', 'E2F1', 'TFDP1', 'JUND', 'IRF9', 'RFX2', 'KLF8', 'MYB', 'CUX1', 'HIF1A', 'MYC', 'NFATC1', 'NFKB1', 'CHD2', 'JUNB', 'GABPA', 'YY1', 'SMARCA4', 'MAX', 'ATF7', 'ATF6', 'STAT2', 'IRF2', 'SREBF2', 'POLR2A', 'RELB', 'STAT5A', 'IRF1', 'IRF7', 'STAT1', 'RELA', 'PBX3', 'POLE3', 'ELF2']
{'Acute-Infected_blood_CD4+ CTL': '#64616e', 'Uninfected_blood_CD4+ CTL': '#aef150', 'Acute-Infected_brain_CD4+ CTL': '#f8bf8c'}
No description has been provided for this image
InĀ [25]:
regulon_sig2
Out[25]:
{'Uninfected_blood_CD4+ CTL': {'RUNX2': 0.0,
  'ETS1': 0.49,
  'TFDP2': 1.0,
  'NR3C1': 1.0,
  'ELF1': 1.0,
  'BDP1': 1.0,
  'NRF1': 1.0,
  'TAF1': 1.0,
  'EP300': 1.0,
  'JUN': 1.0,
  'CREM': 0.88,
  'REL': 1.0,
  'FOS': 0.99,
  'ATF3': 1.0,
  'FOSL2': 0.0,
  'E2F3': 0.17,
  'TBX21': 0.0,
  'EOMES': 0.94,
  'IRF8': 1.0,
  'KLF10': 0.13,
  'BRCA1': 1.0,
  'E2F1': 1.0,
  'TFDP1': 1.0,
  'JUND': 0.98,
  'IRF9': 0.02,
  'RFX2': 0.45,
  'KLF8': 0.98,
  'MYB': 1.0,
  'CUX1': 1.0,
  'HIF1A': 1.0,
  'MYC': 1.0,
  'NFATC1': 0.9,
  'NFKB1': 1.0,
  'CHD2': 1.0,
  'JUNB': 1.0,
  'GABPA': 0.8,
  'YY1': 0.49,
  'SMARCA4': 1.0,
  'MAX': 0.04,
  'ATF7': 1.0,
  'ATF6': 0.97,
  'STAT2': 0.02,
  'IRF2': 0.69,
  'SREBF2': 1.0,
  'POLR2A': 1.0,
  'RELB': 1.0,
  'STAT5A': 0.0,
  'IRF1': 0.0,
  'IRF7': 0.0,
  'STAT1': 0.0,
  'RELA': 0.0,
  'PBX3': 1.0,
  'POLE3': 1.0,
  'ELF2': 0.51},
 'Acute-Infected_blood_CD4+ CTL': {'RUNX2': 1.0,
  'ETS1': 0.01,
  'TFDP2': 1.0,
  'NR3C1': 1.0,
  'ELF1': 1.0,
  'BDP1': 0.99,
  'NRF1': 1.0,
  'TAF1': 1.0,
  'EP300': 1.0,
  'JUN': 1.0,
  'CREM': 0.0,
  'REL': 1.0,
  'FOS': 1.0,
  'ATF3': 1.0,
  'FOSL2': 0.0,
  'E2F3': 1.0,
  'TBX21': 0.0,
  'EOMES': 1.0,
  'IRF8': 1.0,
  'KLF10': 0.23,
  'BRCA1': 1.0,
  'E2F1': 1.0,
  'TFDP1': 1.0,
  'JUND': 0.9,
  'IRF9': 0.01,
  'RFX2': 1.0,
  'KLF8': 1.0,
  'MYB': 1.0,
  'CUX1': 1.0,
  'HIF1A': 1.0,
  'MYC': 1.0,
  'NFATC1': 1.0,
  'NFKB1': 1.0,
  'CHD2': 1.0,
  'JUNB': 1.0,
  'GABPA': 1.0,
  'YY1': 1.0,
  'SMARCA4': 0.79,
  'MAX': 0.77,
  'ATF7': 1.0,
  'ATF6': 1.0,
  'STAT2': 0.96,
  'IRF2': 1.0,
  'SREBF2': 1.0,
  'POLR2A': 1.0,
  'RELB': 1.0,
  'STAT5A': 1.0,
  'IRF1': 0.15,
  'IRF7': 0.98,
  'STAT1': 0.07,
  'RELA': 0.0,
  'PBX3': 1.0,
  'POLE3': 1.0,
  'ELF2': 0.3},
 'Uninfected_brain_CD4+ CTL': {'RUNX2': 0.0,
  'ETS1': 1.0,
  'TFDP2': 0.0,
  'NR3C1': 0.0,
  'ELF1': 0.0,
  'BDP1': 0.0,
  'NRF1': 0.2,
  'TAF1': 1.0,
  'EP300': 1.0,
  'JUN': 1.0,
  'CREM': 0.96,
  'REL': 0.99,
  'FOS': 0.0,
  'ATF3': 0.0,
  'FOSL2': 0.0,
  'E2F3': 0.0,
  'TBX21': 0.0,
  'EOMES': 0.08,
  'IRF8': 0.0,
  'KLF10': 0.0,
  'BRCA1': 1.0,
  'E2F1': 1.0,
  'TFDP1': 1.0,
  'JUND': 0.89,
  'IRF9': 0.99,
  'RFX2': 1.0,
  'KLF8': 1.0,
  'MYB': 1.0,
  'CUX1': 1.0,
  'HIF1A': 1.0,
  'MYC': 1.0,
  'NFATC1': 1.0,
  'NFKB1': 1.0,
  'CHD2': 1.0,
  'JUNB': 1.0,
  'GABPA': 0.0,
  'YY1': 1.0,
  'SMARCA4': 0.03,
  'MAX': 0.0,
  'ATF7': 0.0,
  'ATF6': 0.0,
  'STAT2': 0.0,
  'IRF2': 0.23,
  'SREBF2': 0.0,
  'POLR2A': 0.0,
  'RELB': 1.0,
  'STAT5A': 0.04,
  'IRF1': 0.0,
  'IRF7': 0.99,
  'STAT1': 1.0,
  'RELA': 1.0,
  'PBX3': 1.0,
  'POLE3': 0.01,
  'ELF2': 0.0},
 'Acute-Infected_brain_CD4+ CTL': {'RUNX2': 1.0,
  'ETS1': 1.0,
  'TFDP2': 1.0,
  'NR3C1': 1.0,
  'ELF1': 1.0,
  'BDP1': 0.17,
  'NRF1': 1.0,
  'TAF1': 1.0,
  'EP300': 1.0,
  'JUN': 1.0,
  'CREM': 0.0,
  'REL': 0.51,
  'FOS': 0.0,
  'ATF3': 0.0,
  'FOSL2': 0.0,
  'E2F3': 0.16,
  'TBX21': 0.0,
  'EOMES': 1.0,
  'IRF8': 1.0,
  'KLF10': 0.0,
  'BRCA1': 1.0,
  'E2F1': 1.0,
  'TFDP1': 1.0,
  'JUND': 0.96,
  'IRF9': 0.0,
  'RFX2': 1.0,
  'KLF8': 1.0,
  'MYB': 1.0,
  'CUX1': 1.0,
  'HIF1A': 1.0,
  'MYC': 1.0,
  'NFATC1': 1.0,
  'NFKB1': 1.0,
  'CHD2': 1.0,
  'JUNB': 1.0,
  'GABPA': 0.0,
  'YY1': 0.93,
  'SMARCA4': 0.03,
  'MAX': 0.01,
  'ATF7': 1.0,
  'ATF6': 0.79,
  'STAT2': 0.16,
  'IRF2': 0.43,
  'SREBF2': 0.61,
  'POLR2A': 0.0,
  'RELB': 0.98,
  'STAT5A': 1.0,
  'IRF1': 0.0,
  'IRF7': 0.99,
  'STAT1': 0.03,
  'RELA': 0.0,
  'PBX3': 1.0,
  'POLE3': 0.04,
  'ELF2': 0.0}}

perm testing¶

InĀ [161]:
def plot(ax, cond1, cond2, z_vals, pdf):  # we'll reuse this
    ax.plot(z_vals, pdf)
    ax.set_title("Kendall Tau Test Null Distribution between {} and {}".format(cond1, cond2))
    ax.set_xlabel("statistic")
    ax.set_ylabel("probability density")
    
def nullDist(data, control, treatment):
    n = len(data[control])  
    var = (2*(2*n + 5)) / (9*n*(n - 1))
    dist = stats.norm(scale=np.sqrt(var))
    z_vals = np.linspace(-1.25, 1.25, 100)
    pdf = dist.pdf(z_vals)
    fig, ax = plt.subplots(figsize=(8, 5))
    
    plot(ax, control, treatment, z_vals, pdf)
    plt.show()
    return dist, z_vals, pdf
InĀ [162]:
def final_permutation_Test(data, control, treatment):
    dist, z_vals, pdf = nullDist(data, control, treatment)
    
    x = data[control]
    y = data[treatment]
    
    def statistic(x):  # explore all possible pairings by permuting `x`
        return stats.kendalltau(x, y).statistic  # ignore pvalue
        
    ref = stats.permutation_test((x,), statistic,
                                 permutation_type='pairings')
    
    fig, ax = plt.subplots(figsize=(8, 5))
    plot(ax, control, treatment, z_vals, pdf)
    bins = np.linspace(-1.25, 1.25, 25)
    ax.hist(ref.null_distribution, bins=bins, density=True)
    ax.legend(['asymptotic approximation\n(many observations)',
               'exact null distribution'])
    plot(ax, control, treatment, z_vals, pdf)
    plt.show()
    return ref.pvalue
InĀ [311]:
final_permutation_Test(pbData,"Naive CD4 T", "CD14+ Mono")
No description has been provided for this image
No description has been provided for this image
Out[311]:
0.0038
InĀ [312]:
comparison2.compareLayers("Uninfected_brain_CD4+ CTL", "Acute-Infected_brain_CD4+ CTL", .045) ## with the control
No description has been provided for this image
InĀ [24]:
pbData.columns
Out[24]:
Index(['Unnamed: 0', 'Memory CD4 T', 'B', 'CD14+ Mono', 'NK', 'CD8 T',
       'FCGR3A+ Mono', 'Naive CD4 T', 'DC', 'Platelet'],
      dtype='object')
InĀ [11]:
pbData = pd.read_csv("PBMC_study/QA_QC_PBMC_rss_values_Feb3.csv")
df_pbmc_RAS = pd.read_csv("PBMC_study/obj_AUC_metadata_PBMC_2_Feb22.csv")
labels3 = pbData.columns[1:-1].tolist()


PBMC_cell_Types = [
    "Naive CD4 T",
    "FCGR3A+ Mono",
    "CD14+ Mono",
    "Memory CD4 T",
    'DC',
    'NK',
    'B'
#    "CD8 T"
]
comparisonPBMC = ComparisonTree("Naive CD4 T", df_pbmc_RAS, "cell_clusters", pbData, PBMC_cell_Types, "Unnamed: 0", "PBMC_study/3.5_AUCellThresholds_Info_PVMC_QA_QC.tsv")
comparisonPBMC.construct_tree()



regulon_sig_PB = {}
for i in PBMC_cell_Types:
    p_vals = comparisonPBMC.plotRSS_NMF(i, drawQuadrants=True, include_pvals=True)
    regulon_sig_PB[i] = p_vals

comparisonPBMC.plot_3dEmbedding()
FCGR3A+ Mono -0.3754152823920266
CD14+ Mono -0.3023255813953488
DC -0.22480620155038758
NK 0.16500553709856036
B 0.46843853820598
Memory CD4 T 0.769656699889258
Naive CD4 T 0.9999999999999999
FCGR3A+ Mono
CD14+ Mono
DC
NK
B
Memory CD4 T
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
['Naive CD4 T', 'FCGR3A+ Mono', 'CD14+ Mono', 'Memory CD4 T', 'DC', 'NK', 'B']
['SPIB', 'RFX5', 'IRF8', 'NFE2L2', 'IRF1', 'RELB', 'CEBPA', 'FOSL2', 'IRF7', 'NFE2', 'KLF4', 'SPI1', 'POLE3', 'STAT1', 'IRF2', 'TBL1XR1', 'MAX', 'ATF3', 'KLF11', 'ELF1', 'TRIM69', 'YY1', 'REL', 'KLF12', 'GTF2B', 'UTP18', 'KLF13', 'CTCF', 'THAP1', 'DBP', 'XBP1', 'TBX21', 'HOXB2', 'ATF1', 'ELK3', 'POLR2A', 'BCLAF1', 'ZNF143', 'ETS1', 'FLI1', 'KLF2', 'ELF2', 'ATF4']
{'FCGR3A+ Mono': '#f6405c', 'CD14+ Mono': '#13de99', 'DC': '#8579ca', 'NK': '#033f63', 'B': '#e96a19', 'Memory CD4 T': '#ecbb49'}
No description has been provided for this image
InĀ [13]:
pbData = pd.read_csv("PBMC_study/QA_QC_PBMC_rss_values_Feb3.csv")
df_pbmc_RAS = pd.read_csv("PBMC_study/obj_AUC_metadata_PBMC_2_Feb22.csv")
labels3 = pbData.columns[1:-1].tolist()


PBMC_cell_Types = [
    "Naive CD4 T",
    "FCGR3A+ Mono",
    "CD14+ Mono",
    "Memory CD4 T",
    "CD8 T",
    "NK",
    "DC",
    "B"
]
comparisonPBMC = ComparisonTree("Naive CD4 T", df_pbmc_RAS, "cell_clusters", pbData, PBMC_cell_Types, "Unnamed: 0", "PBMC_study/3.5_AUCellThresholds_Info_PVMC_QA_QC.tsv")
comparisonPBMC.construct_tree()



regulon_sig_PB = {}
for i in PBMC_cell_Types:
    p_vals = comparisonPBMC.plotRSS_NMF(i, drawQuadrants=True, include_pvals=True)
    regulon_sig_PB[i] = p_vals

comparisonPBMC.plot_3dEmbedding()
FCGR3A+ Mono -0.3754152823920266
CD14+ Mono -0.3023255813953488
DC -0.22480620155038758
NK 0.16500553709856036
B 0.46843853820598
CD8 T 0.49058693244739754
Memory CD4 T 0.769656699889258
Naive CD4 T 0.9999999999999999
FCGR3A+ Mono
CD14+ Mono
DC
NK
B
CD8 T
Memory CD4 T
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
['Naive CD4 T', 'FCGR3A+ Mono', 'CD14+ Mono', 'Memory CD4 T', 'CD8 T', 'NK', 'DC', 'B']
['SPIB', 'RFX5', 'IRF8', 'NFE2L2', 'IRF1', 'RELB', 'CEBPA', 'FOSL2', 'IRF7', 'NFE2', 'KLF4', 'SPI1', 'POLE3', 'STAT1', 'IRF2', 'TBL1XR1', 'MAX', 'ATF3', 'KLF11', 'ELF1', 'TRIM69', 'YY1', 'REL', 'KLF12', 'GTF2B', 'UTP18', 'KLF13', 'CTCF', 'THAP1', 'DBP', 'XBP1', 'TBX21', 'HOXB2', 'ATF1', 'ELK3', 'POLR2A', 'BCLAF1', 'ZNF143', 'ETS1', 'FLI1', 'KLF2', 'ELF2', 'ATF4']
{'FCGR3A+ Mono': '#c63b25', 'CD14+ Mono': '#efef35', 'DC': '#814652', 'NK': '#0968c5', 'B': '#cff513', 'CD8 T': '#da04f6', 'Memory CD4 T': '#0f6ff9'}
No description has been provided for this image
InĀ [316]:
comparisonPBMC.plot_3dEmbedding(regulonsToView=["FOSL2", "RFX5", "CEBPA"], clustersToLabel=["CD14+ Mono", "FCGR3A+ Mono"])
['Naive CD4 T', 'FCGR3A+ Mono', 'CD14+ Mono', 'Memory CD4 T']
['SPIB', 'RFX5', 'IRF8', 'NFE2L2', 'IRF1', 'RELB', 'CEBPA', 'FOSL2', 'IRF7', 'NFE2', 'KLF4', 'SPI1', 'POLE3', 'STAT1', 'IRF2', 'TBL1XR1', 'MAX', 'ATF3', 'KLF11', 'ELF1', 'TRIM69', 'YY1', 'REL', 'KLF12', 'GTF2B', 'UTP18', 'KLF13', 'CTCF', 'THAP1', 'DBP', 'XBP1', 'TBX21', 'HOXB2', 'ATF1', 'ELK3', 'POLR2A', 'BCLAF1', 'ZNF143', 'ETS1', 'FLI1', 'KLF2', 'ELF2', 'ATF4']
{'FCGR3A+ Mono': '#04edb4', 'CD14+ Mono': '#d44efd', 'Memory CD4 T': '#83e710'}
No description has been provided for this image
InĀ [187]:
comparisonPBMC.compareLayers("Naive CD4 T", "Memory CD4 T", .015)
No description has been provided for this image
InĀ [166]:
data.columns
Out[166]:
Index(['Unnamed: 0', 'Acute-Infected_blood_CD4+ CTL',
       'Acute-Infected_blood_CD4+ Naive', 'Acute-Infected_blood_CD4+ EM',
       'Acute-Infected_blood_CD4+ Treg',
       'Acute-Infected_blood_CD4+ Proliferating',
       'Acute-Infected_brain_CD8+ Effector', 'Acute-Infected_brain_CD4+ CTL'],
      dtype='object')
InĀ [104]:
acute_infected_cell_types
Out[104]:
['Acute-Infected_blood_CD4+ Naive',
 'Acute-Infected_blood_CD4+ Treg',
 'Acute-Infected_blood_CD4+ EM',
 'Acute-Infected_blood_CD4+ Proliferating',
 'Acute-Infected_blood_CD4+ CTL']
InĀ [163]:
data = pd.read_csv("SIV_monkey/rss_AcuteInfected.csv") ## this would be one comparison (RSS)
df_RAS = pd.read_csv("SIV_monkey/RAS_AUC_AcuteInfected.csv") ## grab this from your SCENIC stuff, include ALL METADATA AUC AND cellLabels

acute_infected_cell_types = [
                            "Acute-Infected_blood_CD4+ Naive",
                            "Acute-Infected_blood_CD4+ Treg",
                            "Acute-Infected_blood_CD4+ EM",
                            "Acute-Infected_blood_CD4+ Proliferating",
                            "Acute-Infected_blood_CD4+ CTL"
                            ]


compariso3n = ComparisonTree('Acute-Infected_blood_CD4+ CTL', df_RAS, "SCENIC_RSS_3", data, acute_infected_cell_types, "Unnamed: 0", "SIV_monkey/3.5_AUCellThresholds_Info_Acute_infected.tsv")
compariso3n.construct_tree() 


egulon_sig_PB = {}
for i in acute_infected_cell_types:
    p_vals = compariso3n.plotRSS_NMF(i, drawQuadrants=True, include_pvals=True)
    egulon_sig_PB[i] = p_vals
Acute-Infected_blood_CD4+ Naive -0.49480519480519486
Acute-Infected_blood_CD4+ Treg -0.38701298701298703
Acute-Infected_blood_CD4+ EM -0.32597402597402597
Acute-Infected_blood_CD4+ Proliferating 0.4
Acute-Infected_blood_CD4+ CTL 1.0
Acute-Infected_blood_CD4+ Naive
Acute-Infected_blood_CD4+ Treg
Acute-Infected_blood_CD4+ EM
Acute-Infected_blood_CD4+ Proliferating
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
InĀ [165]:
set(df_RAS["name"].tolist())
Out[165]:
{'93T', '94T', '95T'}
InĀ [40]:
bonf_list = [egulon_sig_PB[""][i] for i in egulon_sig_PB["Acute-Infected_blood_CD4+ Naive"]]
InĀ [103]:
data = pd.read_csv("SIV_monkey/rss_AcuteInfected.csv") ## this would be one comparison (RSS)
df_RAS = pd.read_csv("SIV_monkey/RAS_AUC_AcuteInfected.csv") ## grab this from your SCENIC stuff, include ALL METADATA AUC AND cellLabels

collect_types.init_types_collection()

acute_infected_cell_types = [
                                "Acute-Infected_blood_CD4+ Naive",
                                "Acute-Infected_blood_CD4+ Treg",
                                "Acute-Infected_blood_CD4+ EM",
                                "Acute-Infected_blood_CD4+ Proliferating",
                                "Acute-Infected_brain_CD4+ CTL",
                                "Acute-Infected_blood_CD4+ CTL"
                                ]
    
    
compariso3n2 = ComparisonTree('Acute-Infected_blood_CD4+ CTL', df_RAS, "SCENIC_RSS_3", data, acute_infected_cell_types, "Unnamed: 0", "SIV_monkey/3.5_AUCellThresholds_Info_Acute_infected.tsv")
compariso3n2.construct_tree() 
    
    
egulon_sig_PB = {}
for i in acute_infected_cell_types:
    p_vals = compariso3n2.plotRSS_NMF(i, drawQuadrants=True, include_pvals=True)
    egulon_sig_PB[i] = p_vals
Acute-Infected_blood_CD4+ Naive -0.49480519480519486
Acute-Infected_blood_CD4+ Treg -0.38701298701298703
Acute-Infected_blood_CD4+ EM -0.32597402597402597
Acute-Infected_blood_CD4+ Proliferating 0.4
Acute-Infected_brain_CD4+ CTL 0.625974025974026
Acute-Infected_blood_CD4+ CTL 1.0
Acute-Infected_blood_CD4+ Naive
Acute-Infected_blood_CD4+ Treg
Acute-Infected_blood_CD4+ EM
Acute-Infected_blood_CD4+ Proliferating
Acute-Infected_brain_CD4+ CTL
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Bonferroni correction!¶

InĀ [149]:
cluster = "Acute-Infected_blood_CD4+ EM"
InĀ [150]:
bonf_lis= [egulon_sig_PB[cluster][i] for i in egulon_sig_PB[cluster]]
InĀ [151]:
from statsmodels.sandbox.stats.multicomp import multipletests
p_adjusted = multipletests(bonf_lis,alpha=0.01, method='bonferroni')
InĀ [152]:
for i, ele in list(zip(p_adjusted[0].tolist(), egulon_sig_PB[cluster].keys())):
    if i:
        print(ele)
FOS
ZNF250
MXI1
CHD2
RFX1
STAT1
InĀ [109]:
len(compariso3n2.label)
Out[109]:
56
InĀ [298]:
compariso3n.plot_3dEmbedding(regulonsToView=["ZNF250", "MXI1"], 
                             clustersToLabel=["Acute-Infected_blood_CD4+ Naive", "Acute-Infected_blood_CD4+ EM", "Acute-Infected_blood_CD4+ Treg"])
['Acute-Infected_blood_CD4+ Naive', 'Acute-Infected_blood_CD4+ Treg', 'Acute-Infected_blood_CD4+ EM', 'Acute-Infected_blood_CD4+ Proliferating', 'Acute-Infected_blood_CD4+ CTL']
['NFYB', 'BRCA1', 'E2F1', 'TFDP1', 'YY1', 'RELA', 'SP3', 'RELB', 'JUN', 'FOS', 'ZNF250', 'EP300', 'NFKB1', 'ATF6', 'TAF1', 'AHCTF1', 'JUNB', 'MXI1', 'CHD2', 'ELF1', 'ETS1', 'FLI1', 'NR3C1', 'RB1', 'TFDP2', 'IRF4', 'WRNIP1', 'ETV7', 'E2F3', 'RFX5', 'EOMES', 'TBX21', 'DDIT3', 'REL', 'JUND', 'CREM', 'FOSB', 'ATF3', 'NR2C2', 'SIN3A', 'ELF2', 'MLXIP', 'GABPA', 'ATF2', 'SP1', 'E2F4', 'PBX3', 'SMAD5', 'RFX1', 'RXRA', 'ELF4', 'ELK3', 'IRF2', 'IRF1', 'STAT1', 'IRF9']
{'Acute-Infected_blood_CD4+ Naive': '#bc9599', 'Acute-Infected_blood_CD4+ Treg': '#22add5', 'Acute-Infected_blood_CD4+ EM': '#b8d999', 'Acute-Infected_blood_CD4+ Proliferating': '#c88184'}
No description has been provided for this image
InĀ [313]:
a = final_permutation_Test(data, "Acute-Infected_brain_CD4+ CTL", "Acute-Infected_blood_CD4+ CTL")
print(a)
No description has been provided for this image
No description has been provided for this image
0.0002
InĀ [302]:
compariso3n2.compareLayers("Acute-Infected_blood_CD4+ CTL", "Acute-Infected_brain_CD4+ CTL", .05) ## this was just the Acute Infected
No description has been provided for this image
InĀ [175]:
import pandas as pd
from scipy.stats import kendalltau

# Assuming 'df' is your DataFrame with the relevant columns
columns = ["Acute-Infected_blood_CD4+ Naive", "Acute-Infected_blood_CD4+ EM", "Acute-Infected_blood_CD4+ Treg"]

# Initialize an empty dictionary to store Kendall Tau results
kendall_results = {}

# Nested for loop to calculate pairwise Kendall Tau
for col1 in columns:
    kendall_results[col1] = {}
    for col2 in columns:
        tau, _ = kendalltau(data[col1], data[col2])
        kendall_results[col1][col2] = tau

# Convert the dictionary to a pandas DataFrame for better readability
kendall_df = pd.DataFrame(kendall_results)

# Print the DataFrame, which you can copy and paste
print(kendall_df.to_string(index=True))
                                 Acute-Infected_blood_CD4+ Naive  Acute-Infected_blood_CD4+ EM  Acute-Infected_blood_CD4+ Treg
Acute-Infected_blood_CD4+ Naive                         1.000000                      0.631169                        0.744156
Acute-Infected_blood_CD4+ EM                            0.631169                      1.000000                        0.697403
Acute-Infected_blood_CD4+ Treg                          0.744156                      0.697403                        1.000000